Objective: General Analysis


1. Set Up

README

In general, the data sets are relatively large and take up a large amount of memory after loading. It is recommended that, after loading, the data sets be subsetted or summarised and that unused data frames be removed.

The code is shown in the next three tabs to show any changes that may have been made. In general, no columns or rows are removed. Filtering occurs in the next step.


General

library(dplyr)
library(ggplot2)
library(extrafont)
library(stringr)
loadfonts()
library(tidyr)
library(DT)
library(tidytext)
library(psych)
library(sf)
library(leaflet)
library(ggpubr)

# Global ggplot2 theme: minimal base, no minor grid lines, black Quicksand text
theme_set(
  theme_minimal() +
    theme(
      text = element_text(colour = "black", family = "Quicksand"),
      panel.grid.minor = element_blank()
    )
)

# Strongly prefer fixed over scientific notation when printing numbers
options(scipen = 999)

# Run the helper script (presumably defines occ_modifier(), used when the
# census years are combined)
source("0_OCC_Modifier.R")

Loading 1850

# Read the 1850 census extracts and tag every row with its city
mn_1850 <- readr::read_csv("../Data/census_1850_occ_mn.csv") %>%
  mutate(city = "Manhattan")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   occstr = col_character(),
##   bplstr = col_character(),
##   us1850c_1053 = col_character(),
##   stdcity = col_character()
## )
## See spec(...) for full column specifications.
bk_1850 <- readr::read_csv("../Data/census_1850_occ_bk.csv") %>%
  mutate(city = "Brooklyn")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   occstr = col_character(),
##   bplstr = col_character(),
##   us1850c_1053 = col_character(),
##   stdcity = col_character()
## )
## See spec(...) for full column specifications.

# Lookup tables mapping occupation codes to text labels
OCC_1880 <- readr::read_csv("../Data/OCC1880.csv")
## Parsed with column specification:
## cols(
##   code = col_double(),
##   occ_label = col_character()
## )
OCC_1950 <- readr::read_csv("../Data/OCC1950.csv")
## Parsed with column specification:
## cols(
##   code = col_double(),
##   occ1950_label = col_character()
## )
# Stack both cities, keep the analysis columns and decode the coded fields
combined_1850 <- rbind(mn_1850, bk_1850) %>%
  select(
    age, sex, race, labforce, occ, city, occ1950, occstr,
    enumdist, ward, ind1950
  ) %>%
  # Attach the text label for each occupation code
  left_join(OCC_1880, by = c("occ" = "code")) %>%
  left_join(OCC_1950, by = c("occ1950" = "code")) %>%
  mutate(
    # Codes 100 and 120 are both labelled "White"
    race = factor(
      race,
      levels = c(100, 120, 200, 210, 300),
      labels = c(
        "White", "White",
        "Black/African American/Negro",
        "Mulatto",
        "American Indian/Alaska Native (AIAN)"
      )
    ),
    labforce = factor(
      labforce,
      levels = c(0, 1, 2),
      labels = c(
        "N/A",
        "No, not in the labor force",
        "Yes, in the labor force"
      )
    ),
    sex = factor(sex, levels = c(1, 2), labels = c("Male", "Female")),
    year = 1850
  ) %>%
  # Keep labor-force members only, in the column layout used when the three
  # census years are combined
  filter(labforce == "Yes, in the labor force") %>%
  select(
    year, age, sex, race, city, enumdist,
    occstr, occ_label, occ1950_label, ward, ind1950
  ) %>%
  mutate(
    race = as.character(race),
    sex = as.character(sex)
  )

# Saving Memory
rm(bk_1850, mn_1850)


Loading 1880

# Read the 1880 census extracts and tag every row with its city
mn_1880 <- readr::read_csv("../Data/census_1880_occ_mn.csv") %>%
  mutate(city = "Manhattan")
bk_1880 <- readr::read_csv("../Data/census_1880_occ_bk.csv") %>%
  mutate(city = "Brooklyn")

# Stack both cities and bring the columns into the shared layout
combined_1880 <- rbind(mn_1880, bk_1880) %>%
  select(
    age, sex, race, labforce, occ, city, occstr, occ1950, enumdist
  ) %>%
  left_join(OCC_1880, by = c("occ" = "code")) %>%
  mutate(
    # "Less than 1 year old" becomes 0 so the column can be made numeric
    age = as.numeric(ifelse(age == "Less than 1 year old", 0, age)),
    year = 1880
  ) %>%
  # Keep labor-force members only; occ1950 is carried over as occ1950_label
  filter(labforce == "Yes, in the labor force") %>%
  select(
    year, age, sex, race, city, enumdist,
    occstr, occ_label, occ1950_label = occ1950
  )

# Saving Memory
rm(OCC_1880, bk_1880, mn_1880)


Loading 1910

# Read the 1910 census extracts and the 1950 occupation-code lookup
mn_1910 <- readr::read_csv("../Data/census_1910_occ_mn.csv") %>%
  mutate(city = "Manhattan")
bk_1910 <- readr::read_csv("../Data/census_1910_occ_bk.csv") %>%
  mutate(city = "Brooklyn")
OCC_1950 <- readr::read_csv("../Data/OCC1950.csv")

# Stack both cities, label the occupation codes and decode the coded fields
combined_1910 <- rbind(mn_1910, bk_1910) %>%
  left_join(OCC_1950, by = c("occ1950" = "code")) %>%
  select(
    age, sex, labor_force, race, occ1950, occstr, occ1950_label, city,
    enumdist
  ) %>%
  mutate(
    sex = factor(sex, levels = c(1, 2), labels = c("Male", "Female")),
    labor_force = factor(
      labor_force,
      levels = c(0, 1, 2),
      labels = c(
        "N/A",
        "No, not in the labor force",
        "Yes, in the labor force"
      )
    ),
    race = factor(
      race,
      levels = c(100, 140, 200, 210, 300, 400, 500, 600, 672),
      labels = c(
        "White", "Mexican (1930)",
        "Black/African American/Negro",
        "Mulatto",
        "American Indian/Alaska Native (AIAN)",
        "Chinese",
        "Japanese",
        "Filipino",
        "Asian, not specified"
      )
    ),
    year = 1910
  ) %>%
  # Keep labor-force members only, in the column layout used for combining;
  # the 1950 label is used for both occ_label and occ1950_label
  filter(labor_force == "Yes, in the labor force") %>%
  select(
    year, age, sex, race, city, enumdist,
    occstr, occ_label = occ1950_label
  ) %>%
  mutate(
    occ1950_label = occ_label,
    sex = as.character(sex),
    race = as.character(race),
    enumdist = as.numeric(enumdist)
  )

# Saving Memory
rm(OCC_1950, mn_1910, bk_1910)


Combining all time periods, applying occ_modifier

# Stack the three census years and normalise the free-text occupation string
# with occ_modifier()
combined <- bind_rows(
  combined_1850,
  combined_1880,
  combined_1910
) %>%
  mutate(occstr = occ_modifier(occstr))

# Saving Memory. combined_1850 is deliberately kept: the 1850 ward-level maps
# further down read it directly.
rm(combined_1880, combined_1910)

# Cache the combined table and read it back from the SAME path, so the data
# analysed below is the data that was just built (the write previously went
# to "combined.csv" while the read came from "../Data/combined.csv").
readr::write_csv(combined, "../Data/combined.csv")
combined <- readr::read_csv("../Data/combined.csv") %>%
  mutate(
    year = as.factor(year),
    sex = as.factor(sex),
    race = as.factor(race),
    city = as.factor(city)
  )
## Parsed with column specification:
## cols(
##   year = col_double(),
##   age = col_double(),
##   sex = col_character(),
##   race = col_character(),
##   city = col_character(),
##   enumdist = col_double(),
##   occstr = col_character(),
##   occ_label = col_character(),
##   occ1950_label = col_character()
## )
# Labor-force size per census year, computed once and reused as the
# denominator of the percentage calculations below
var_lf_1850 <- nrow(filter(combined, year == 1850))
var_lf_1880 <- nrow(filter(combined, year == 1880))
var_lf_1910 <- nrow(filter(combined, year == 1910))
var_lf <- sum(var_lf_1850, var_lf_1880, var_lf_1910)


shp Files

# Return the labels of the `n` most frequent occupations in `combined`.
#
# n        Number of occupations to return.
# year_in  "all" (default) for every census year, or one of 1850, 1880, 1910
#          (number or string) to restrict the ranking to that year.
#
# Returns the occ1950_label values ordered from most to least frequent;
# rows labelled "Not yet classified" are never counted.
# Stops with an error for any other `year_in` (previously this silently
# returned NULL, which made `%in% top_n_occupations(...)` match nothing).
top_n_occupations <- function(n, year_in = "all") {
  year_in <- as.character(year_in)
  valid_years <- c("1850", "1880", "1910")

  if (length(year_in) != 1 || !(year_in %in% c("all", valid_years))) {
    stop(
      "year_in must be \"all\" or one of ",
      paste(valid_years, collapse = ", "),
      call. = FALSE
    )
  }

  occupations <- combined %>%
    filter(occ1950_label != "Not yet classified")

  if (year_in != "all") {
    occupations <- occupations %>%
      filter(year == year_in)
  }

  occupations %>%
    group_by(occ1950_label) %>%
    # dplyr::n() is spelled out because the argument is also called `n`
    summarise(count = dplyr::n()) %>%
    arrange(desc(count)) %>%
    head(n) %>%
    pull(occ1950_label)
}
# 1850 ward polygons: keep only the ward number and geometry for each city
bk_1850 <- st_read(
  "../Data/shpfiles/bk_shapefiles/Ward_1850_BK.shp",
  stringsAsFactors = FALSE
) %>%
  select(Ward_Num, geometry)
## Reading layer `Ward_1850_BK' from data source `C:\Users\Clinton\Documents\GitHub\hnyc_occupations\Data\shpfiles\bk_shapefiles\Ward_1850_BK.shp' using driver `ESRI Shapefile'
## Simple feature collection with 11 features and 3 fields
## geometry type:  POLYGON
## dimension:      XY
## bbox:           xmin: -8240555 ymin: 4958954 xmax: -8226921 ymax: 4969322
## epsg (SRID):    3857
## proj4string:    +proj=merc +a=6378137 +b=6378137 +lat_ts=0.0 +lon_0=0.0 +x_0=0.0 +y_0=0 +k=1.0 +units=m +nadgrids=@null +wktext +no_defs
mn_1850 <- st_read(
  "../Data/shpfiles/mn_shapefiles/Ward_1850_MN.shp",
  stringsAsFactors = FALSE
) %>%
  select(Ward_Num, geometry)
## Reading layer `Ward_1850_MN' from data source `C:\Users\Clinton\Documents\GitHub\hnyc_occupations\Data\shpfiles\mn_shapefiles\Ward_1850_MN.shp' using driver `ESRI Shapefile'
## Simple feature collection with 19 features and 3 fields
## geometry type:  POLYGON
## dimension:      XY
## bbox:           xmin: -8239443 ymin: 4968339 xmax: -8227670 ymax: 4994306
## epsg (SRID):    3857
## proj4string:    +proj=merc +a=6378137 +b=6378137 +lat_ts=0.0 +lon_0=0.0 +x_0=0.0 +y_0=0 +k=1.0 +units=m +nadgrids=@null +wktext +no_defs

# Stack both cities and reproject to EPSG:4326
shp_1850 <- rbind(bk_1850, mn_1850) %>%
  st_transform(crs = 4326)

rm(bk_1850, mn_1850)
# 1880 enumeration-district polygons; the Manhattan file names the district
# column ed80, so it is renamed to ED to match Brooklyn
bk_1880 <- st_read(
  "../Data/shpfiles/bk_shapefiles/ED_1880_S4_BK.shp",
  stringsAsFactors = FALSE
) %>%
  select(ED, geometry)
## Reading layer `ED_1880_S4_BK' from data source `C:\Users\Clinton\Documents\GitHub\hnyc_occupations\Data\shpfiles\bk_shapefiles\ED_1880_S4_BK.shp' using driver `ESRI Shapefile'
## Simple feature collection with 250 features and 152 fields
## geometry type:  POLYGON
## dimension:      XY
## bbox:           xmin: -8240479 ymin: 4958979 xmax: -8226015 ymax: 4973979
## epsg (SRID):    3857
## proj4string:    +proj=merc +a=6378137 +b=6378137 +lat_ts=0.0 +lon_0=0.0 +x_0=0.0 +y_0=0 +k=1.0 +units=m +nadgrids=@null +wktext +no_defs
mn_1880 <- st_read(
  "../Data/shpfiles/mn_shapefiles/ED_1880_MN.shp",
  stringsAsFactors = FALSE
) %>%
  select(ED = ed80, geometry)
## Reading layer `ED_1880_MN' from data source `C:\Users\Clinton\Documents\GitHub\hnyc_occupations\Data\shpfiles\mn_shapefiles\ED_1880_MN.shp' using driver `ESRI Shapefile'
## Simple feature collection with 662 features and 41 fields
## geometry type:  POLYGON
## dimension:      XY
## bbox:           xmin: -8239612 ymin: 4968158 xmax: -8227670 ymax: 4994306
## epsg (SRID):    3857
## proj4string:    +proj=merc +a=6378137 +b=6378137 +lat_ts=0.0 +lon_0=0.0 +x_0=0.0 +y_0=0 +k=1.0 +units=m +nadgrids=@null +wktext +no_defs

# Stack both cities and reproject to EPSG:4326
shp_1880 <- rbind(bk_1880, mn_1880) %>%
  st_transform(crs = 4326)

rm(bk_1880, mn_1880)
# 1910 enumeration-district polygons for each city
bk_1910 <- st_read(
  "../Data/shpfiles/bk_shapefiles/Brooklyn_1910.shp",
  stringsAsFactors = FALSE
)
## Reading layer `Brooklyn_1910' from data source `C:\Users\Clinton\Documents\GitHub\hnyc_occupations\Data\shpfiles\bk_shapefiles\Brooklyn_1910.shp' using driver `ESRI Shapefile'
## Simple feature collection with 1112 features and 1 field
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 1825539 ymin: 556929.4 xmax: 1841193 ymax: 576192.4
## epsg (SRID):    NA
## proj4string:    +proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=37.5 +lon_0=-96 +x_0=0 +y_0=0 +datum=NAD83 +units=m +no_defs
mn_1910 <- st_read(
  "../Data/shpfiles/mn_shapefiles/Manhattan_1910.shp",
  stringsAsFactors = FALSE
)
## Reading layer `Manhattan_1910' from data source `C:\Users\Clinton\Documents\GitHub\hnyc_occupations\Data\shpfiles\mn_shapefiles\Manhattan_1910.shp' using driver `ESRI Shapefile'
## Simple feature collection with 1480 features and 1 field
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 1823260 ymin: 568821.3 xmax: 1831540 ymax: 591889.7
## epsg (SRID):    NA
## proj4string:    +proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=37.5 +lon_0=-96 +x_0=0 +y_0=0 +datum=NAD83 +units=m +no_defs

# Stack both cities, make ED numeric (it is joined to the numeric enumdist
# column later) and reproject to EPSG:4326
shp_1910 <- rbind(bk_1910, mn_1910) %>%
  mutate(ED = as.numeric(ED)) %>%
  st_transform(crs = 4326)

rm(bk_1910, mn_1910)

2. Analysis of Demographics

Age

Sex

Percent of Labor Force
# Share of each census year's labor force by sex, one facet per year
combined %>%
  group_by(year, sex) %>%
  summarise(count = n()) %>%
  # Turn each count into a percentage of that year's labor force
  mutate(
    count = case_when(
      year == 1850 ~ count / var_lf_1850,
      year == 1880 ~ count / var_lf_1880,
      year == 1910 ~ count / var_lf_1910
    ),
    count = round(count * 100, digits = 1)
  ) %>%
  ggplot(aes(x = sex, y = count, fill = sex)) +
  geom_col() +
  geom_text(
    aes(label = paste0(count, "%")),
    family = "Quicksand",
    vjust = -0.5
  ) +
  facet_wrap(~year) +
  # NOTE(review): col_2 is not defined in the visible part of this file;
  # confirm it is created before this chunk runs
  scale_fill_manual(values = col_2) +
  # The plotted value is a percentage, so the axis is labelled as one
  # (it previously read "Count")
  labs(
    y = "Percent of Labor Force", x = "Sex",
    title = "Graph of Sex of Percentage of Labor Force across Time Periods"
  ) +
  theme(legend.position = "none")


Race

Percent of Labor Force
# Share of each census year's labor force by race, one facet per year
combined %>%
  group_by(year, race) %>%
  summarise(count = n()) %>%
  # Turn each count into a percentage of that year's labor force
  mutate(
    count = case_when(
      year == 1850 ~ count / var_lf_1850,
      year == 1880 ~ count / var_lf_1880,
      year == 1910 ~ count / var_lf_1910
    ),
    count = round(count * 100, digits = 1)
  ) %>%
  ggplot(aes(x = race, y = count, fill = race)) +
  geom_col() +
  geom_text(
    aes(label = paste0(count, "%")),
    family = "Quicksand",
    hjust = 0
  ) +
  facet_wrap(~year, scales = "free_x") +
  coord_flip() +
  # Extra room past the bar ends so the text labels are not clipped
  scale_y_continuous(expand = expand_scale(mult = c(0, 0.5))) +
  # The plotted value is a percentage, so the axis is labelled as one
  # (it previously read "Count")
  labs(
    y = "Percent of Labor Force", x = "Race",
    title = "Graph of Percentage of Race of Labor Force across Time Periods"
  ) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(size = 7)
  )


City

Percent of Labor Force
# Share of each census year's labor force by city, one facet per year
combined %>%
  group_by(year, city) %>%
  summarise(count = n()) %>%
  # Turn each count into a percentage of that year's labor force
  mutate(
    count = case_when(
      year == 1850 ~ count / var_lf_1850,
      year == 1880 ~ count / var_lf_1880,
      year == 1910 ~ count / var_lf_1910
    ),
    count = round(count * 100, digits = 1)
  ) %>%
  ggplot(aes(x = city, y = count, fill = city)) +
  geom_col() +
  geom_text(
    aes(label = paste0(count, "%")),
    family = "Quicksand",
    hjust = 0
  ) +
  facet_wrap(~year) +
  coord_flip() +
  # NOTE(review): col_2 is not defined in the visible part of this file;
  # confirm it is created before this chunk runs
  scale_fill_manual(values = col_2) +
  # Extra room past the bar ends so the text labels are not clipped
  scale_y_continuous(expand = expand_scale(mult = c(0, 0.5))) +
  # The plotted value is a percentage, so the axis is labelled as one
  # (it previously read "Count")
  labs(
    y = "Percent of Labor Force", x = "Borough",
    title = "Graph of Percent of Borough of Labor Force across Time Periods"
  ) +
  theme(legend.position = "none")


3. Analysis of Occupation

Datatable of General Counts

Overall
# Occupation counts over all years, with each one's share of the whole
# labor force
combined %>%
  group_by(occ1950_label) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  mutate(perc = round(count / var_lf * 100, digits = 2)) %>%
  # Datatable can only handle a limited amount of data
  head(200) %>%
  datatable()


1850
# 1850 occupation counts, with each one's share of the 1850 labor force
combined %>%
  filter(year == 1850) %>%
  group_by(occ1950_label) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  mutate(perc = round(count / var_lf_1850 * 100, digits = 2)) %>%
  # Datatable can only handle a limited amount of data
  head(200) %>%
  datatable()


1880
# 1880 occupation counts, with each one's share of the 1880 labor force
combined %>%
  filter(year == 1880) %>%
  group_by(occ1950_label) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  mutate(perc = round(count / var_lf_1880 * 100, digits = 2)) %>%
  # Datatable can only handle a limited amount of data
  head(200) %>%
  datatable()


1910
# 1910 occupation counts, with each one's share of the 1910 labor force
combined %>%
  filter(year == 1910) %>%
  group_by(occ1950_label) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  mutate(perc = round(count / var_lf_1910 * 100, digits = 2)) %>%
  # Datatable can only handle a limited amount of data
  head(200) %>%
  datatable()


Graph of General Counts

Percent of Labor Force
# Ten most common occupations in each census year, shown as a percentage of
# that year's labor force
combined %>%
  group_by(year, occ1950_label) %>%
  summarise(count = n()) %>%
  group_by(year) %>%
  top_n(10, wt = count) %>%
  # Turn each count into a percentage of that year's labor force
  mutate(
    count = case_when(
      year == 1850 ~ count / var_lf_1850,
      year == 1880 ~ count / var_lf_1880,
      year == 1910 ~ count / var_lf_1910
    ),
    count = round(count * 100, digits = 1)
  ) %>%
  ggplot(aes(
    # Order the bars within each facet independently
    x = reorder_within(occ1950_label, count, year),
    y = count,
    fill = year
  )) +
  geom_col() +
  geom_text(
    aes(label = paste0(count, "%")),
    family = "Quicksand",
    hjust = 0
  ) +
  facet_wrap(~year, scales = "free_y") +
  scale_x_reordered() +
  # Extra room past the bar ends so the text labels are not clipped
  scale_y_continuous(expand = expand_scale(mult = c(0, 0.5))) +
  # NOTE(review): col_3 is not defined in the visible part of this file;
  # confirm it is created before this chunk runs
  scale_fill_manual(values = col_3) +
  coord_flip() +
  # The plotted value is a percentage, so the axis is labelled as one
  # (it previously read "Count")
  labs(
    y = "Percent of Labor Force", x = "Occupation",
    title = "Graph of Most Popular Occupations across Time Periods as Percentage of Labor Force"
  ) +
  theme(legend.position = "none")


4. Maps of Counts

1850

Leaflet Map of Counts
Distributions of Occupation

Occupation

# Ward-level counts of the ten most common 1850 occupations, one map per
# occupation. Relies on combined_1850 (the 1850-only frame that carries
# `ward`) being available at this point.
combined_1850 %>%
  filter(
    year == 1850,
    occ1950_label %in% top_n_occupations(10, 1850)
  ) %>%
  group_by(occ1950_label, ward) %>%
  summarise(count = n()) %>%
  # Attach the counts to the ward polygons; polygons left without an
  # occupation are dropped
  left_join(shp_1850, ., by = c("Ward_Num" = "ward")) %>%
  filter(!is.na(occ1950_label)) %>%
  ggplot() +
  geom_sf(aes(fill = count), colour = "black", size = 0.25) +
  facet_wrap(~occ1950_label, nrow = 2) +
  scale_fill_viridis_c() +
  labs(fill = "Count") +
  theme(
    axis.text = element_blank(),
    panel.grid = element_blank(),
    strip.text.x = element_text(size = 7),
    legend.position = "right",
    legend.key.height = unit(2, "line"),
    legend.key.width = unit(1, "line")
  )


Normalised by size of each ward

# Percentage of each ward's 1850 labor force working in each of the ten most
# common 1850 occupations, one map per occupation. Relies on combined_1850
# (the 1850-only frame that carries `ward`) being available at this point.
combined_1850 %>%
  group_by(ward) %>%
  mutate(ward_size = n()) %>%
  # Ungroup so top_n_occupations() is evaluated once instead of once per ward
  ungroup() %>%
  filter(occ1950_label %in% top_n_occupations(10, 1850)) %>%
  group_by(occ1950_label, ward, ward_size) %>%
  summarise(count = n()) %>%
  mutate(count = count / ward_size * 100) %>%
  # Attach the values to the ward polygons; polygons left without an
  # occupation are dropped
  left_join(shp_1850, ., by = c("Ward_Num" = "ward")) %>%
  filter(!is.na(occ1950_label)) %>%
  ggplot() +
  geom_sf(aes(fill = count), colour = "black", size = 0.25) +
  facet_wrap(~occ1950_label, nrow = 2) +
  scale_fill_viridis_c() +
  # The fill is a percentage of the ward, not a count
  labs(fill = "Percent") +
  theme(
    axis.text = element_blank(),
    panel.grid = element_blank(),
    strip.text.x = element_text(size = 7),
    legend.position = "right",
    legend.key.height = unit(2, "line"),
    legend.key.width = unit(1, "line")
  )


Grouped

# Ward-level counts of the ten most common 1850 occupations, binned into five
# count bands. Relies on combined_1850 (the 1850-only frame that carries
# `ward`) being available at this point.
combined_1850 %>%
  filter(
    year == 1850,
    occ1950_label %in% top_n_occupations(10, 1850)
  ) %>%
  group_by(occ1950_label, ward) %>%
  summarise(count = n()) %>%
  mutate(
    # Right-closed bands: <= 500, <= 1000, <= 1500, <= 2000, above 2000
    count = cut(
      count,
      breaks = c(-Inf, 500, 1000, 1500, 2000, Inf),
      labels = c(
        "   0 -  500", "501 - 1000",
        "1001 - 1500", "1501 - 2000",
        "2001 -  Inf"
      )
    )
  ) %>%
  left_join(shp_1850, ., by = c("Ward_Num" = "ward")) %>%
  filter(!is.na(occ1950_label)) %>%
  ggplot() +
  geom_sf(aes(fill = count), colour = "black", size = 0.25) +
  facet_wrap(~occ1950_label, nrow = 2) +
  scale_fill_viridis_d() +
  labs(fill = "Count") +
  theme(
    axis.text = element_blank(),
    panel.grid = element_blank(),
    strip.text.x = element_text(size = 7),
    legend.position = "right",
    legend.key.height = unit(2, "line"),
    legend.key.width = unit(1, "line")
  )


Normalised and grouped by size of each ward

# Percentage of each ward's 1850 labor force in each of the ten most common
# 1850 occupations, binned into five percentage bands. Relies on
# combined_1850 (the 1850-only frame that carries `ward`) being available.
combined_1850 %>%
  group_by(ward) %>%
  mutate(ward_size = n()) %>%
  # Ungroup so top_n_occupations() is evaluated once instead of once per ward
  ungroup() %>%
  filter(occ1950_label %in% top_n_occupations(10, 1850)) %>%
  group_by(occ1950_label, ward, ward_size) %>%
  summarise(count = n()) %>%
  mutate(
    count = count / ward_size * 100,
    # Right-closed bands: e.g. the band labelled " 6 - 10" holds every value
    # above 5 and up to 10
    count = cut(
      count,
      breaks = c(-Inf, 5, 10, 15, 20, 100),
      labels = c(
        " 0 - 5", " 6 - 10",
        "11 - 15", "16 - 20",
        "21 - 100"
      )
    )
  ) %>%
  left_join(shp_1850, ., by = c("Ward_Num" = "ward")) %>%
  filter(!is.na(occ1950_label)) %>%
  ggplot() +
  geom_sf(aes(fill = count), colour = "black", size = 0.25) +
  facet_wrap(~occ1950_label, nrow = 2) +
  scale_fill_viridis_d() +
  # The bands are percentages of the ward, not counts
  labs(fill = "Percent") +
  theme(
    axis.text = element_blank(),
    panel.grid = element_blank(),
    strip.text.x = element_text(size = 7),
    legend.position = "right",
    legend.key.height = unit(2, "line"),
    legend.key.width = unit(1, "line")
  )


Distributions of Race

Race

# Ward-level labor-force counts by race in 1850, one map per race. Relies on
# combined_1850 (the 1850-only frame that carries `ward`) being available.
combined_1850 %>%
  group_by(race, ward) %>%
  summarise(count = n()) %>%
  # Attach the counts to the ward polygons; polygons left without a race
  # value are dropped
  left_join(shp_1850, ., by = c("Ward_Num" = "ward")) %>%
  filter(!is.na(race)) %>%
  ggplot() +
  geom_sf(aes(fill = count), colour = "black", size = 0.25) +
  facet_wrap(~race, nrow = 2) +
  scale_fill_viridis_c() +
  labs(fill = "Count") +
  theme(
    axis.text = element_blank(),
    panel.grid = element_blank(),
    strip.text.x = element_text(size = 7),
    legend.position = "right",
    legend.key.height = unit(2, "line"),
    legend.key.width = unit(1, "line")
  )


Normalised by size of each ward

# Percentage of each ward's 1850 labor force by race, one map per race.
# Relies on combined_1850 (the 1850-only frame that carries `ward`) being
# available at this point.
combined_1850 %>%
  group_by(ward) %>%
  mutate(ward_size = n()) %>%
  group_by(race, ward, ward_size) %>%
  summarise(count = n()) %>%
  mutate(count = count / ward_size * 100) %>%
  # Attach the values to the ward polygons; polygons left without a race
  # value are dropped
  left_join(shp_1850, ., by = c("Ward_Num" = "ward")) %>%
  filter(!is.na(race)) %>%
  ggplot() +
  geom_sf(aes(fill = count), colour = "black", size = 0.25) +
  facet_wrap(~race, nrow = 2) +
  scale_fill_viridis_c() +
  # The fill is a percentage of the ward, not a count
  labs(fill = "Percent") +
  theme(
    axis.text = element_blank(),
    panel.grid = element_blank(),
    strip.text.x = element_text(size = 7),
    legend.position = "right",
    legend.key.height = unit(2, "line"),
    legend.key.width = unit(1, "line")
  )


Grouped

# Ward-level labor-force counts by race in 1850, binned into five count
# bands. Relies on combined_1850 (the 1850-only frame that carries `ward`)
# being available at this point.
combined_1850 %>%
  group_by(race, ward) %>%
  summarise(count = n()) %>%
  mutate(
    # Right-closed bands: <= 5000, <= 7500, <= 10000, <= 12500, above 12500
    count = cut(
      count,
      breaks = c(-Inf, 5000, 7500, 10000, 12500, Inf),
      labels = c(
        "    0 -  5000", " 5001 -  7500",
        " 7501 - 10000", "10001 - 12500",
        "12501 -   Inf"
      )
    )
  ) %>%
  left_join(shp_1850, ., by = c("Ward_Num" = "ward")) %>%
  filter(!is.na(race)) %>%
  ggplot() +
  geom_sf(aes(fill = count), colour = "black", size = 0.25) +
  facet_wrap(~race, nrow = 2) +
  scale_fill_viridis_d() +
  labs(fill = "Count") +
  theme(
    axis.text = element_blank(),
    panel.grid = element_blank(),
    strip.text.x = element_text(size = 7),
    legend.position = "right",
    legend.key.height = unit(2, "line"),
    legend.key.width = unit(1, "line")
  )


Normalised and grouped by size of each ward

# Percentage of each ward's 1850 labor force by race, binned into five
# percentage bands. Relies on combined_1850 (the 1850-only frame that
# carries `ward`) being available at this point.
combined_1850 %>%
  group_by(ward) %>%
  mutate(ward_size = n()) %>%
  group_by(race, ward, ward_size) %>%
  summarise(count = n()) %>%
  mutate(
    count = count / ward_size * 100,
    # Right-closed bands: e.g. the band labelled " 81 - 85" holds every
    # value above 80 and up to 85
    count = cut(
      count,
      breaks = c(-Inf, 80, 85, 90, 95, 100),
      labels = c(
        " 0 - 80", " 81 - 85",
        "86 - 90", "91 - 95",
        "96 - 100"
      )
    )
  ) %>%
  left_join(shp_1850, ., by = c("Ward_Num" = "ward")) %>%
  filter(!is.na(race)) %>%
  ggplot() +
  geom_sf(aes(fill = count), colour = "black", size = 0.25) +
  facet_wrap(~race, nrow = 2) +
  scale_fill_viridis_d() +
  # The bands are percentages of the ward, not counts
  labs(fill = "Percent") +
  theme(
    axis.text = element_blank(),
    panel.grid = element_blank(),
    strip.text.x = element_text(size = 7),
    legend.position = "right",
    legend.key.height = unit(2, "line"),
    legend.key.width = unit(1, "line")
  )


1880

Leaflet Map of Counts
Distributions of Occupation


Occupation

# Enumeration-district counts of the ten most common 1880 occupations, one
# map per occupation
combined %>%
  filter(
    year == 1880,
    occ1950_label %in% top_n_occupations(10, 1880)
  ) %>%
  group_by(occ1950_label, enumdist) %>%
  summarise(count = n()) %>%
  # Attach the counts to the district polygons; polygons left without an
  # occupation are dropped
  left_join(shp_1880, ., by = c("ED" = "enumdist")) %>%
  filter(!is.na(occ1950_label)) %>%
  ggplot() +
  geom_sf(aes(fill = count), colour = "black", size = 0.25) +
  facet_wrap(~occ1950_label, nrow = 2) +
  scale_fill_viridis_c() +
  labs(fill = "Count") +
  theme(
    axis.text = element_blank(),
    panel.grid = element_blank(),
    strip.text.x = element_text(size = 7),
    legend.position = "right",
    legend.key.height = unit(2, "line"),
    legend.key.width = unit(1, "line")
  )


Normalised by size of each enumeration district

# Percentage of each enumeration district's 1880 labor force in each of the
# ten most common 1880 occupations, one map per occupation
combined %>%
  filter(year == 1880) %>%
  group_by(enumdist) %>%
  mutate(enum_size = n()) %>%
  # Ungroup so top_n_occupations() is evaluated once instead of once per
  # enumeration district
  ungroup() %>%
  filter(occ1950_label %in% top_n_occupations(10, 1880)) %>%
  group_by(occ1950_label, enumdist, enum_size) %>%
  summarise(count = n()) %>%
  mutate(count = count / enum_size * 100) %>%
  # Attach the values to the district polygons; polygons left without an
  # occupation are dropped
  left_join(shp_1880, ., by = c("ED" = "enumdist")) %>%
  filter(!is.na(occ1950_label)) %>%
  ggplot() +
  geom_sf(aes(fill = count), colour = "black", size = 0.25) +
  facet_wrap(~occ1950_label, nrow = 2) +
  scale_fill_viridis_c() +
  # The fill is a percentage of the district, not a count
  labs(fill = "Percent") +
  theme(
    axis.text = element_blank(),
    panel.grid = element_blank(),
    strip.text.x = element_text(size = 7),
    legend.position = "right",
    legend.key.height = unit(2, "line"),
    legend.key.width = unit(1, "line")
  )


Grouped

# Enumeration-district counts of the ten most common 1880 occupations,
# binned into five count bands
combined %>%
  filter(
    year == 1880,
    occ1950_label %in% top_n_occupations(10, 1880)
  ) %>%
  group_by(occ1950_label, enumdist) %>%
  summarise(count = n()) %>%
  mutate(
    # Right-closed bands: <= 100, <= 200, <= 300, <= 400, above 400
    count = cut(
      count,
      breaks = c(-Inf, 100, 200, 300, 400, Inf),
      labels = c(
        "  0 - 100", "101 - 200",
        "201 - 300", "301 - 400",
        "401 - Inf"
      )
    )
  ) %>%
  left_join(shp_1880, ., by = c("ED" = "enumdist")) %>%
  filter(!is.na(occ1950_label)) %>%
  ggplot() +
  geom_sf(aes(fill = count), colour = "black", size = 0.25) +
  facet_wrap(~occ1950_label, nrow = 2) +
  scale_fill_viridis_d() +
  labs(fill = "Count") +
  theme(
    axis.text = element_blank(),
    panel.grid = element_blank(),
    strip.text.x = element_text(size = 7),
    legend.position = "right",
    legend.key.height = unit(2, "line"),
    legend.key.width = unit(1, "line")
  )


Grouped and Normalised by size of each enumeration district

# Percentage of each enumeration district's 1880 labor force in each of the
# ten most common 1880 occupations, binned into five percentage bands
combined %>%
  filter(year == 1880) %>%
  group_by(enumdist) %>%
  mutate(enum_size = n()) %>%
  # Ungroup so top_n_occupations() is evaluated once instead of once per
  # enumeration district
  ungroup() %>%
  filter(occ1950_label %in% top_n_occupations(10, 1880)) %>%
  group_by(occ1950_label, enumdist, enum_size) %>%
  summarise(count = n()) %>%
  mutate(
    count = count / enum_size * 100,
    # Right-closed bands: e.g. the band labelled "16 - 30" holds every value
    # above 15 and up to 30
    count = cut(
      count,
      breaks = c(-Inf, 15, 30, 45, 60, 100),
      labels = c(
        " 0 - 15", "16 - 30",
        "31 - 45", "46 - 60",
        "61 - 100"
      )
    )
  ) %>%
  left_join(shp_1880, ., by = c("ED" = "enumdist")) %>%
  filter(!is.na(occ1950_label)) %>%
  ggplot() +
  geom_sf(aes(fill = count), colour = "black", size = 0.25) +
  facet_wrap(~occ1950_label, nrow = 2) +
  scale_fill_viridis_d() +
  # The bands are percentages of the district, not counts
  labs(fill = "Percent") +
  theme(
    axis.text = element_blank(),
    panel.grid = element_blank(),
    strip.text.x = element_text(size = 7),
    legend.position = "right",
    legend.key.height = unit(2, "line"),
    legend.key.width = unit(1, "line")
  )


Distributions of Race


Race

# Enumeration-district labor-force counts by race in 1880, one map per race
combined %>%
  filter(year == 1880) %>%
  group_by(race, enumdist) %>%
  summarise(count = n()) %>%
  # Attach the counts to the district polygons; polygons left without a race
  # value are dropped
  left_join(shp_1880, ., by = c("ED" = "enumdist")) %>%
  filter(!is.na(race)) %>%
  ggplot() +
  geom_sf(aes(fill = count), colour = "black", size = 0.25) +
  facet_wrap(~race, nrow = 2) +
  scale_fill_viridis_c() +
  labs(fill = "Count") +
  theme(
    axis.text = element_blank(),
    panel.grid = element_blank(),
    strip.text.x = element_text(size = 7),
    legend.position = "right",
    legend.key.height = unit(2, "line"),
    legend.key.width = unit(1, "line")
  )


Normalised by size of each enumeration district

# Percentage of each enumeration district's 1880 labor force by race, one
# map per race
combined %>%
  filter(year == 1880) %>%
  group_by(enumdist) %>%
  mutate(enum_size = n()) %>%
  group_by(race, enumdist, enum_size) %>%
  summarise(count = n()) %>%
  mutate(count = count / enum_size * 100) %>%
  # Attach the values to the district polygons; polygons left without a race
  # value are dropped
  left_join(shp_1880, ., by = c("ED" = "enumdist")) %>%
  filter(!is.na(race)) %>%
  ggplot() +
  geom_sf(aes(fill = count), colour = "black", size = 0.25) +
  facet_wrap(~race, nrow = 2) +
  scale_fill_viridis_c() +
  # The fill is a percentage of the district, not a count
  labs(fill = "Percent") +
  theme(
    axis.text = element_blank(),
    panel.grid = element_blank(),
    strip.text.x = element_text(size = 7),
    legend.position = "right",
    legend.key.height = unit(2, "line"),
    legend.key.width = unit(1, "line")
  )


Grouped

# Enumeration-district labor-force counts by race in 1880, binned into five
# count bands
combined %>%
  filter(year == 1880) %>%
  group_by(race, enumdist) %>%
  summarise(count = n()) %>%
  mutate(
    # Right-closed bands: <= 500, <= 1000, <= 1500, <= 2000, above 2000
    count = cut(
      count,
      breaks = c(-Inf, 500, 1000, 1500, 2000, Inf),
      labels = c(
        "   0 -  500", " 501 - 1000",
        "1001 - 1500", "1501 - 2000",
        "2001 -  Inf"
      )
    )
  ) %>%
  left_join(shp_1880, ., by = c("ED" = "enumdist")) %>%
  filter(!is.na(race)) %>%
  ggplot() +
  geom_sf(aes(fill = count), colour = "black", size = 0.25) +
  facet_wrap(~race, nrow = 2) +
  scale_fill_viridis_d() +
  labs(fill = "Count") +
  theme(
    axis.text = element_blank(),
    panel.grid = element_blank(),
    strip.text.x = element_text(size = 7),
    legend.position = "right",
    legend.key.height = unit(2, "line"),
    legend.key.width = unit(1, "line")
  )


Grouped and Normalised by size of each enumeration district

# Percentage of each enumeration district's 1880 labor force by race, binned
# into five percentage bands
combined %>%
  filter(year == 1880) %>%
  group_by(enumdist) %>%
  mutate(enum_size = n()) %>%
  group_by(race, enumdist, enum_size) %>%
  summarise(count = n()) %>%
  mutate(
    count = count / enum_size * 100,
    # Right-closed bands: e.g. the band labelled "21 - 40" holds every value
    # above 20 and up to 40
    count = cut(
      count,
      breaks = c(-Inf, 20, 40, 60, 80, 100),
      labels = c(
        " 0 - 20", "21 - 40",
        "41 - 60", "61 - 80",
        "81 - 100"
      )
    )
  ) %>%
  left_join(shp_1880, ., by = c("ED" = "enumdist")) %>%
  filter(!is.na(race)) %>%
  ggplot() +
  geom_sf(aes(fill = count), colour = "black", size = 0.25) +
  facet_wrap(~race, nrow = 2) +
  scale_fill_viridis_d() +
  # The bands are percentages of the district, not counts
  labs(fill = "Percent") +
  theme(
    axis.text = element_blank(),
    panel.grid = element_blank(),
    strip.text.x = element_text(size = 7),
    legend.position = "right",
    legend.key.height = unit(2, "line"),
    legend.key.width = unit(1, "line")
  )


1910

Leaflet Map of Counts
# Labor-force count per 1910 enumeration district, joined onto the district
# polygons
sub <- combined %>%
  filter(year == 1910) %>%
  group_by(enumdist) %>%
  summarise(count = n()) %>%
  left_join(shp_1910, ., by = c("ED" = "enumdist"))

# Colour bands of width 500 covering 0 to 3000
bins <- seq(0, 3000, by = 500)
pal <- colorBin("YlOrRd", domain = sub$count, bins = bins)

# One HTML hover label per polygon
label <- lapply(paste("<b>Count:</b>", sub$count), htmltools::HTML)

# Interactive choropleth of the counts
leaflet(sub) %>%
  addProviderTiles("CartoDB.Positron") %>%
  setView(lng = -73.9500, lat = 40.7128, zoom = 11) %>%
  addPolygons(
    fillColor = ~pal(count),
    fillOpacity = 0.7,
    color = "black",
    weight = 0.25,
    label = label
  ) %>%
  addLegend(
    pal = pal,
    values = ~count,
    position = "bottomright"
  )


Distributions of Occupation


Occupation

# 1910: number of records per enumeration district for each occupation
# returned by top_n_occupations(10, 1910), one map facet per occupation.
combined %>%
  filter(year == 1910,
         occ1950_label %in% top_n_occupations(10, 1910)) %>%
  count(occ1950_label, enumdist, name = "count") %>%
  # Shapefile on the left so the joined result carries the geometry
  left_join(shp_1910, ., by = c("ED" = "enumdist")) %>%
  # Drop polygons that matched no summary row
  filter(!is.na(occ1950_label)) %>%
  ggplot() +
  geom_sf(aes(fill = count), colour = "black", size = 0.25) +
  facet_wrap(vars(occ1950_label), nrow = 2) +
  scale_fill_viridis_c() +
  labs(fill = "Count") +
  theme(panel.grid = element_blank(),
        axis.text = element_blank(),
        strip.text.x = element_text(size = 7),
        legend.position = "right",
        legend.key.width = unit(1, "line"),
        legend.key.height = unit(2, "line"))


Normalised by size of each enumeration district

# 1910: for each occupation returned by top_n_occupations(10, 1910), the
# percentage of an enumeration district's (ED) 1910 records holding that
# occupation, one map facet per occupation.
combined %>%
  filter(year == 1910) %>%
  # enum_size is taken BEFORE the occupation filter, so the denominator is
  # every 1910 record in the ED, not only the selected occupations
  add_count(enumdist, name = "enum_size") %>%
  filter(occ1950_label %in% top_n_occupations(10, 1910)) %>%
  count(occ1950_label, enumdist, enum_size, name = "count") %>%
  mutate(count = count / enum_size * 100) %>%
  # Shapefile on the left so the joined result carries the geometry
  left_join(shp_1910, ., by = c("ED" = "enumdist")) %>%
  # Drop polygons that matched no summary row
  filter(!is.na(occ1950_label)) %>%
  ggplot() +
  geom_sf(aes(fill = count),
          colour = "black",
          size = 0.25) +
  facet_wrap(~occ1950_label, nrow = 2) +
  scale_fill_viridis_c() +
  # The fill is a percentage of the ED, not a raw count
  labs(fill = "Share of ED (%)") +
  theme(legend.position = "right",
        axis.text = element_blank(),
        strip.text.x = element_text(size = 7),
        legend.key.height = unit(2, "line"),
        legend.key.width = unit(1, "line"),
        panel.grid = element_blank())


Grouped

# 1910: records per enumeration district for each occupation returned by
# top_n_occupations(10, 1910), binned into five count bands.
combined %>%
  filter(year == 1910,
         occ1950_label %in% top_n_occupations(10, 1910)) %>%
  count(occ1950_label, enumdist, name = "count") %>%
  mutate(
    # Right-closed bands: <= 100, <= 200, <= 300, <= 400, everything above
    count = cut(count,
                breaks = c(-Inf, 100, 200, 300, 400, Inf),
                labels = c("  0 - 100", "101 - 200",
                           "201 - 300", "301 - 400",
                           "401 - Inf"))
  ) %>%
  # Shapefile on the left so the joined result carries the geometry
  left_join(shp_1910, ., by = c("ED" = "enumdist")) %>%
  # Drop polygons that matched no summary row
  filter(!is.na(occ1950_label)) %>%
  ggplot() +
  geom_sf(aes(fill = count), colour = "black", size = 0.25) +
  facet_wrap(vars(occ1950_label), nrow = 2) +
  scale_fill_viridis_d() +
  labs(fill = "Count") +
  theme(panel.grid = element_blank(),
        axis.text = element_blank(),
        strip.text.x = element_text(size = 7),
        legend.position = "right",
        legend.key.width = unit(1, "line"),
        legend.key.height = unit(2, "line"))


Grouped and Normalised by size of each enumeration district

# 1910: for each occupation returned by top_n_occupations(10, 1910), the
# percentage of an enumeration district's (ED) 1910 records holding that
# occupation, binned into five 20-point bands, one map facet per occupation.
combined %>%
  filter(year == 1910) %>%
  # enum_size is taken BEFORE the occupation filter, so the denominator is
  # every 1910 record in the ED, not only the selected occupations
  add_count(enumdist, name = "enum_size") %>%
  filter(occ1950_label %in% top_n_occupations(10, 1910)) %>%
  count(occ1950_label, enumdist, enum_size, name = "count") %>%
  mutate(
    count = count / enum_size * 100,
    # Right-closed bands: (0, 20], (20, 40], (40, 60], (60, 80], (80, 100]
    count = cut(count,
                breaks = seq(0, 100, by = 20),
                labels = c(" 0 - 20", "21 - 40",
                           "41 - 60", "61 - 80",
                           "81 - 100"))
  ) %>%
  # Shapefile on the left so the joined result carries the geometry
  left_join(shp_1910, ., by = c("ED" = "enumdist")) %>%
  # Drop polygons that matched no summary row
  filter(!is.na(occ1950_label)) %>%
  ggplot() +
  geom_sf(aes(fill = count),
          colour = "black",
          size = 0.25) +
  facet_wrap(~occ1950_label, nrow = 2) +
  scale_fill_viridis_d() +
  # The fill is a percentage band, not a raw count
  labs(fill = "Share of ED (%)") +
  theme(legend.position = "right",
        axis.text = element_blank(),
        strip.text.x = element_text(size = 7),
        legend.key.height = unit(2, "line"),
        legend.key.width = unit(1, "line"),
        panel.grid = element_blank())


Distributions of Race


Race

# 1910: number of records per enumeration district for each race category,
# one map facet per race.
combined %>%
  filter(year == 1910) %>%
  count(race, enumdist, name = "count") %>%
  # Shapefile on the left so the joined result carries the geometry
  left_join(shp_1910, ., by = c("ED" = "enumdist")) %>%
  # Drop polygons that matched no summary row
  filter(!is.na(race)) %>%
  ggplot() +
  geom_sf(aes(fill = count), colour = "black", size = 0.25) +
  facet_wrap(vars(race), nrow = 2) +
  scale_fill_viridis_c() +
  labs(fill = "Count") +
  theme(panel.grid = element_blank(),
        axis.text = element_blank(),
        strip.text.x = element_text(size = 7),
        legend.position = "right",
        legend.key.width = unit(1, "line"),
        legend.key.height = unit(2, "line"))


Normalised by size of each enumeration district

# 1910: for every enumeration district (ED), the percentage of its records
# that fall in each race category, one map facet per race.
combined %>%
  filter(year == 1910) %>%
  # enum_size = number of 1910 records in the ED (the denominator)
  add_count(enumdist, name = "enum_size") %>%
  # count = number of records per race within the ED (the numerator)
  count(race, enumdist, enum_size, name = "count") %>%
  mutate(count = count / enum_size * 100) %>%
  # Shapefile on the left so the joined result carries the geometry
  left_join(shp_1910, ., by = c("ED" = "enumdist")) %>%
  # Drop polygons that matched no summary row
  filter(!is.na(race)) %>%
  ggplot() +
  geom_sf(aes(fill = count),
          colour = "black",
          size = 0.25) +
  facet_wrap(~race, nrow = 2) +
  scale_fill_viridis_c() +
  # The fill is a percentage of the ED, not a raw count
  labs(fill = "Share of ED (%)") +
  theme(legend.position = "right",
        axis.text = element_blank(),
        strip.text.x = element_text(size = 7),
        legend.key.height = unit(2, "line"),
        legend.key.width = unit(1, "line"),
        panel.grid = element_blank())


Grouped

# 1910: records per enumeration district for each race category, binned
# into five count bands, one map facet per race.
combined %>%
  filter(year == 1910) %>%
  count(race, enumdist, name = "count") %>%
  mutate(
    # Right-closed bands: <= 500, <= 1000, <= 1500, <= 2000, everything above
    count = cut(count,
                breaks = c(-Inf, 500, 1000, 1500, 2000, Inf),
                labels = c("   0 -  500", " 501 - 1000",
                           "1001 - 1500", "1501 - 2000",
                           "2001 -  Inf"))
  ) %>%
  # Shapefile on the left so the joined result carries the geometry
  left_join(shp_1910, ., by = c("ED" = "enumdist")) %>%
  # Drop polygons that matched no summary row
  filter(!is.na(race)) %>%
  ggplot() +
  geom_sf(aes(fill = count), colour = "black", size = 0.25) +
  facet_wrap(vars(race), nrow = 2) +
  scale_fill_viridis_d() +
  labs(fill = "Count") +
  theme(panel.grid = element_blank(),
        axis.text = element_blank(),
        strip.text.x = element_text(size = 7),
        legend.position = "right",
        legend.key.width = unit(1, "line"),
        legend.key.height = unit(2, "line"))


Grouped and Normalised by size of each enumeration district

# 1910: for every enumeration district (ED), the percentage of its records
# that fall in each race category, binned into five 20-point bands and drawn
# as one map facet per race.
combined %>%
  filter(year == 1910) %>%
  # enum_size = number of 1910 records in the ED (the denominator)
  add_count(enumdist, name = "enum_size") %>%
  # count = number of records per race within the ED (the numerator)
  count(race, enumdist, enum_size, name = "count") %>%
  mutate(
    count = count / enum_size * 100,
    # Right-closed bands: (0, 20], (20, 40], (40, 60], (60, 80], (80, 100]
    count = cut(count,
                breaks = seq(0, 100, by = 20),
                labels = c(" 0 - 20", "21 - 40",
                           "41 - 60", "61 - 80",
                           "81 - 100"))
  ) %>%
  # Shapefile on the left so the joined result carries the geometry
  left_join(shp_1910, ., by = c("ED" = "enumdist")) %>%
  # Drop polygons that matched no summary row
  filter(!is.na(race)) %>%
  ggplot() +
  geom_sf(aes(fill = count),
          colour = "black",
          size = 0.25) +
  facet_wrap(~race, nrow = 2) +
  scale_fill_viridis_d() +
  # The fill is a percentage band, not a raw count
  labs(fill = "Share of ED (%)") +
  theme(legend.position = "right",
        axis.text = element_blank(),
        strip.text.x = element_text(size = 7),
        legend.key.height = unit(2, "line"),
        legend.key.width = unit(1, "line"),
        panel.grid = element_blank())